In [1]:
from em_utilities import *   # provides np, normalize, diag, deepcopy, EM_for_high_dimension, logpdf_diagonal_gaussian
import sframe as sf
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
import scipy
import time

Section 0:

Dataset definition and feature extraction (tf-idf)


In [2]:
dataset = sf.SFrame('Dataset/KO_data.csv')
dataset.remove_column('X1')          # drop the original index column
dataset = dataset.add_row_number()   # re-number rows from 0
dataset.rename({'id': 'X1'})         # restore the 'X1' column name


[INFO] sframe.cython.cy_server: SFrame v2.1 started. Logging /tmp/sframe_server_1504805937.log
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_data.csv
Parsing completed. Parsed 1423 lines in 0.318258 secs.
Out[2]:
X1  file_name                                  category     text
0   training-dataset/engineering/912.txt ...   engineering  Uber s case for incremental processin ...
1   training-dataset/business/747.txt ...      business     On the Road to Recap Why the Unicorn Financing ...
2   training-dataset/product/919.txt ...       product      How designers can use data to create amazing ...
3   training-dataset/business/222.txt ...      business     The Arc of Company Life and How to Prolong ItOn ...
4   training-dataset/business/238.txt ...      business     Advice to Grads Join A Winning Startup v 2016 ...
5   training-dataset/product/297.txt ...       product      GV Guide to Design Critique GV LibraryGV ...
6   training-dataset/product/1281.txt ...      product      Beating designer s blockThose hours or days ...
7   training-dataset/product/310.txt ...       product      How to create effective push notificationsOver ...
8   training-dataset/product/160.txt ...       product      Thoughtbot s Kyle Fiedler Know yourself and trust ...
9   training-dataset/product/92.txt ...        product      A product team s friend or foe Feature Req ...
[1423 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [3]:
tfidfvec = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tfidfvec.fit_transform(dataset['text'])
tf_idf_matrix = normalize(tf_idf_matrix)  # L2-normalize each document vector

Section 1:

Model parameters: smart initialization

A K-means++ model is used to initialize the parameters of the EM model:

  • K-means++ initializes the means (the cluster centroids); the weights and diagonal covariances are then derived from its hard cluster assignments.

In [4]:
# Smart initialization of the means using a KMeans++ model
def initialize_means(num_clusters, features_matrix):
    from sklearn.cluster import KMeans
    np.random.seed(5)
    kmeans_model = KMeans(n_clusters=num_clusters, init='k-means++', n_init=5, max_iter=400, random_state=1, n_jobs=1)
    kmeans_model.fit(features_matrix)
    centroids, cluster_assignment = kmeans_model.cluster_centers_, kmeans_model.labels_
    means = list(centroids)   # one mean vector per cluster
    return [means, cluster_assignment]

In [5]:
# Smart initialization of the weights: fraction of documents per cluster
def initialize_weights(num_clusters, features_matrix, cluster_assignment):
    num_docs = features_matrix.shape[0]
    weights = []
    for i in xrange(num_clusters):
        num_assigned = len(cluster_assignment[cluster_assignment == i])
        w = float(num_assigned) / num_docs
        weights.append(w)
    return weights
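
The weight computed above is just the fraction of documents that K-means++ assigned to cluster k:

$$\hat{w}_k = \frac{N_k}{N}, \qquad N_k = \bigl|\{\, i : z_i = k \,\}\bigr|$$

where z_i is the hard cluster assignment of document i and N is the total number of documents.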

In [6]:
# Smart initialization of the (diagonal) covariances
def initialize_covs(num_clusters, features_matrix, cluster_assignment, means):
    covs = []
    for i in xrange(num_clusters):
        member_rows = features_matrix[cluster_assignment == i]
        # Per-dimension variance, computed sparsely as E[x^2] - 2*mu*E[x] + mu^2
        cov = (member_rows.multiply(member_rows) - 2*member_rows.dot(diag(means[i]))).sum(axis=0).A1 / member_rows.shape[0] \
              + means[i]**2
        cov[cov < 1e-8] = 1e-8   # floor tiny variances to keep them positive
        covs.append(cov)
    return covs
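
The one-liner above computes each cluster's per-dimension variance without densifying the sparse tf-idf matrix, by expanding the squared deviation:

$$\hat{\sigma}_{kj}^{2} = \frac{1}{N_k}\sum_{i:\,z_i=k}\bigl(x_{ij}-\mu_{kj}\bigr)^{2} = \frac{1}{N_k}\sum_{i:\,z_i=k}\bigl(x_{ij}^{2}-2\,\mu_{kj}\,x_{ij}\bigr) + \mu_{kj}^{2}$$

The 1e-8 floor keeps every variance strictly positive so the diagonal Gaussians remain well defined.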

Section 2:

Training models with different numbers of clusters

The parameters of each model are initialized as above; training then proceeds with the Expectation-Maximization (EM) algorithm.


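For reference, EM fits a mixture of diagonal-covariance Gaussians by locally maximizing the log-likelihood

$$\ell(w,\mu,\sigma) = \sum_{i=1}^{N}\log\sum_{k=1}^{K} w_k\,\mathcal{N}\!\bigl(x_i \mid \mu_k,\ \mathrm{diag}(\sigma_k^2)\bigr)$$

alternating between computing responsibilities (E-step) and re-estimating the weights, means, and variances (M-step).
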
In [7]:
# Model 1 with 10 clusters
(means, cluster_assignment_10model) = initialize_means(10, tf_idf_matrix)
covs = initialize_covs(10, tf_idf_matrix, cluster_assignment_10model, means)
weights = initialize_weights(10, tf_idf_matrix, cluster_assignment_10model)
model_em_10k = EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

In [8]:
# Model 2 with 20 clusters.
(means, cluster_assignment_20model) = initialize_means(20, tf_idf_matrix)
covs = initialize_covs(20, tf_idf_matrix, cluster_assignment_20model, means)
weights = initialize_weights(20, tf_idf_matrix, cluster_assignment_20model)
model_em_20k = EM_for_high_dimension(tf_idf_matrix, means, covs, weights, cov_smoothing=1e-10)

Section 3:

Evaluation report for each cluster (interpreting clusters)

The evaluation report has two parts: the first lists the top words representing each cluster, which is what really makes a cluster interpretable; the second counts the article categories within each cluster, showing how mixed the article types are.


In [9]:
def visualize_EM_clusters(tf_idf, means, covs, map_index_to_word):
    print('')
    print('==========================================================')

    num_clusters = len(means)
    for c in xrange(num_clusters):
        print('Cluster {0:d}: Largest mean parameters in cluster '.format(c))
        print('\n{0: <12}{1: <12}{2: <12}'.format('Word', 'Mean', 'Variance'))
        
        # Indices of the words sorted by decreasing cluster mean
        sorted_word_ids = np.argsort(means[c])[::-1]

        for i in sorted_word_ids[:10]:
            print '{0: <12}{1:<10.2e}{2:10.2e}'.format(map_index_to_word[i], 
                                                       means[c][i],
                                                       covs[c][i])
        print '\n=========================================================='

In [10]:
def clusters_report(clusters_idx):
    cluster_id = 0
    for cluster_indices in clusters_idx:
        countP = 0
        countB = 0
        countE = 0
        for i in cluster_indices:
            if dataset['category'][i] == 'product':
                countP += 1
            elif dataset['category'][i] == 'engineering':
                countE += 1
            elif dataset['category'][i] == 'business':
                countB += 1
        print "Cluster ", cluster_id, "\n==========================\n"
        cluster_id += 1
        print "product count : ", countP, "\nengineering count : ", countE, "\nbusiness count : ", countB, "\n"

In [11]:
visualize_EM_clusters(tf_idf_matrix, model_em_10k['means'], model_em_10k['covs'], tfidfvec.get_feature_names())


==========================================================
Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
company     5.94e-02    3.71e-03
team        5.82e-02    5.92e-03
people      5.53e-02    3.06e-03
startup     4.38e-02    4.79e-03
time        3.97e-02    8.44e-04
work        3.75e-02    2.13e-03
business    3.51e-02    2.63e-03
product     3.15e-02    1.48e-03
don         3.06e-02    1.23e-03
companies   3.04e-02    1.94e-03

==========================================================
Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
users       9.49e-02    8.42e-03
user        8.22e-02    7.33e-03
app         5.62e-02    1.19e-02
design      5.55e-02    3.78e-03
product     5.03e-02    3.25e-03
onboarding  4.32e-02    1.66e-02
mobile      4.22e-02    1.05e-02
use         3.58e-02    8.12e-04
content     3.37e-02    5.02e-03
people      3.20e-02    2.25e-03

==========================================================
Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
data        8.54e-02    1.29e-02
microservices 6.97e-02   2.55e-02
serverless  4.64e-02    2.18e-02
code        4.38e-02    4.24e-03
services    4.33e-02    4.63e-03
service     4.30e-02    4.96e-03
database    3.67e-02    7.93e-03
architecture 3.30e-02    2.35e-03
application 3.06e-02    1.76e-03
server      2.88e-02    2.92e-03

==========================================================
Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
xero        3.87e-01    3.82e-06
vet         2.43e-01    1.51e-06
institute   2.17e-01    1.20e-06
codigodelsur 1.98e-01    1.00e-06
tech        1.78e-01    8.05e-07
veteran     1.66e-01    7.02e-07
accounting  1.55e-01    6.14e-07
firms       1.38e-01    4.88e-07
founder     1.30e-01    4.33e-07
highly      1.01e-01    2.59e-07

==========================================================
Cluster 4: Largest mean parameters in cluster 

Word        Mean        Variance    
netflix     6.53e-01    4.43e-03
blog        2.52e-01    7.47e-03
technology  1.76e-01    4.55e-03
regarding   1.62e-01    3.88e-03
perspectives 1.59e-01    3.70e-03
tech        1.36e-01    3.53e-03
issues      1.02e-01    1.13e-03
challenges  1.00e-01    1.47e-03
focused     9.90e-02    1.44e-03
decisions   9.52e-02    1.33e-03

==========================================================
Cluster 5: Largest mean parameters in cluster 

Word        Mean        Variance    
rails       1.37e-01    5.11e-02
ruby        6.74e-02    1.20e-02
phoenix     5.48e-02    2.14e-02
elixir      5.14e-02    1.93e-02
language    3.78e-02    8.65e-03
akka        3.52e-02    1.34e-02
data        3.50e-02    3.00e-03
combinator  3.45e-02    3.33e-02
redirecting 3.45e-02    3.33e-02
quip        3.45e-02    3.33e-02

==========================================================
Cluster 6: Largest mean parameters in cluster 

Word        Mean        Variance    
people      2.38e-02    1.17e-03
product     2.19e-02    1.27e-03
like        1.91e-02    4.32e-04
new         1.79e-02    5.90e-04
companies   1.74e-02    1.13e-03
time        1.65e-02    4.72e-04
learning    1.64e-02    3.60e-03
value       1.60e-02    2.68e-03
facebook    1.59e-02    3.07e-03
business    1.57e-02    1.06e-03

==========================================================
Cluster 7: Largest mean parameters in cluster 

Word        Mean        Variance    
design      2.47e-01    1.52e-02
designers   1.07e-01    1.35e-02
team        7.60e-02    6.50e-03
sprint      6.79e-02    2.73e-02
product     5.76e-02    3.48e-03
work        5.57e-02    2.97e-03
designer    5.27e-02    2.61e-03
project     4.04e-02    4.25e-03
people      3.75e-02    1.56e-03
sprints     3.74e-02    9.59e-03

==========================================================
Cluster 8: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.44e-01    9.90e-03
customer    5.83e-02    6.51e-03
team        5.78e-02    4.46e-03
customers   5.65e-02    5.00e-03
manager     4.77e-02    5.05e-03
management  4.02e-02    4.19e-03
managers    3.89e-02    3.50e-03
market      3.73e-02    5.22e-03
products    3.62e-02    1.62e-03
people      3.38e-02    1.53e-03

==========================================================
Cluster 9: Largest mean parameters in cluster 

Word        Mean        Variance    
ux          2.04e-01    4.35e-02
design      1.42e-01    9.81e-03
designer    1.22e-01    1.99e-02
newslog     5.16e-02    2.23e-02
product     4.86e-02    4.48e-03
user        4.23e-02    2.88e-03
experience  3.84e-02    2.32e-03
designers   3.67e-02    1.69e-03
meets       3.58e-02    1.05e-02
ideas       3.56e-02    8.95e-03

==========================================================

In [12]:
visualize_EM_clusters(tf_idf_matrix, model_em_20k['means'], model_em_20k['covs'], tfidfvec.get_feature_names())


==========================================================
Cluster 0: Largest mean parameters in cluster 

Word        Mean        Variance    
design      2.93e-01    1.06e-02
designers   8.87e-02    1.02e-02
sprint      8.35e-02    3.31e-02
designer    7.68e-02    1.20e-02
team        6.71e-02    5.77e-03
product     5.61e-02    3.87e-03
sprints     4.57e-02    1.17e-02
work        4.15e-02    1.68e-03
process     3.70e-02    1.78e-03
people      3.29e-02    1.34e-03

==========================================================
Cluster 1: Largest mean parameters in cluster 

Word        Mean        Variance    
sketch      5.86e-02    2.19e-02
javascript  5.79e-02    1.60e-02
react       4.94e-02    1.69e-02
page        4.61e-02    5.32e-03
font        4.50e-02    1.87e-02
user        4.08e-02    2.93e-03
code        4.04e-02    2.74e-03
ember       3.48e-02    1.51e-02
fonts       3.42e-02    1.00e-02
like        3.27e-02    5.69e-04

==========================================================
Cluster 2: Largest mean parameters in cluster 

Word        Mean        Variance    
product     1.71e-01    7.63e-03
customer    1.04e-01    1.05e-02
customers   9.78e-02    7.32e-03
users       6.04e-02    6.33e-03
user        4.96e-02    4.58e-03
market      4.83e-02    7.52e-03
value       4.51e-02    6.08e-03
onboarding  4.15e-02    1.43e-02
marketing   4.06e-02    5.49e-03
pricing     3.86e-02    1.70e-02

==========================================================
Cluster 3: Largest mean parameters in cluster 

Word        Mean        Variance    
people      5.99e-02    3.65e-03
time        3.59e-02    1.10e-03
like        3.37e-02    9.41e-04
work        3.36e-02    2.79e-03
product     3.16e-02    1.69e-03
ve          2.99e-02    1.22e-03
things      2.88e-02    1.22e-03
don         2.87e-02    1.17e-03
just        2.75e-02    7.10e-04
company     2.74e-02    1.51e-03

==========================================================
Cluster 4: Largest mean parameters in cluster 

Word        Mean        Variance    
netflix     6.53e-01    4.43e-03
blog        2.52e-01    7.47e-03
technology  1.76e-01    4.55e-03
regarding   1.62e-01    3.88e-03
perspectives 1.59e-01    3.70e-03
tech        1.36e-01    3.53e-03
issues      1.02e-01    1.13e-03
challenges  1.00e-01    1.47e-03
focused     9.90e-02    1.44e-03
decisions   9.52e-02    1.33e-03

==========================================================
Cluster 5: Largest mean parameters in cluster 

Word        Mean        Variance    
app         8.27e-02    1.70e-02
mobile      7.09e-02    1.57e-02
platform    5.48e-02    1.22e-02
users       5.33e-02    4.81e-03
facebook    5.24e-02    1.03e-02
apps        4.91e-02    5.26e-03
platforms   4.29e-02    8.52e-03
video       4.01e-02    1.16e-02
user        3.98e-02    2.75e-03
people      3.72e-02    1.88e-03

==========================================================
Cluster 6: Largest mean parameters in cluster 

Word        Mean        Variance    
xero        3.87e-01    3.82e-06
vet         2.43e-01    1.51e-06
institute   2.17e-01    1.20e-06
codigodelsur 1.98e-01    1.00e-06
tech        1.78e-01    8.05e-07
veteran     1.66e-01    7.02e-07
accounting  1.55e-01    6.14e-07
firms       1.38e-01    4.88e-07
founder     1.30e-01    4.33e-07
highly      1.01e-01    2.59e-07

==========================================================
Cluster 7: Largest mean parameters in cluster 

Word        Mean        Variance    
serverless  4.78e-01    2.55e-02
faas        3.02e-01    1.01e-01
processing  1.55e-01    3.39e-02
task        9.23e-02    1.38e-02
functions   9.18e-02    3.01e-03
lambda      9.13e-02    6.25e-03
application 8.14e-02    3.03e-03
medium      7.89e-02    7.08e-02
paas        6.89e-02    5.53e-03
gateway     6.84e-02    5.04e-03

==========================================================
Cluster 8: Largest mean parameters in cluster 

Word        Mean        Variance    
startup     1.90e-01    7.94e-03
company     9.08e-02    6.92e-03
founders    8.96e-02    7.81e-03
founder     6.73e-02    6.78e-03
startups    6.46e-02    4.41e-03
business    6.22e-02    3.93e-03
product     5.70e-02    3.12e-03
people      5.34e-02    1.70e-03
growth      5.01e-02    1.01e-02
idea        4.96e-02    5.47e-03

==========================================================
Cluster 9: Largest mean parameters in cluster 

Word        Mean        Variance    
memory      1.97e-01    4.25e-02
virtual     4.67e-02    8.59e-03
file        4.59e-02    1.71e-02
cpu         3.86e-02    4.23e-03
disk        3.82e-02    5.06e-03
data        3.77e-02    1.41e-03
linux       3.73e-02    5.01e-03
code        3.59e-02    4.63e-03
ring        3.58e-02    1.56e-02
postgres    3.55e-02    1.93e-02

==========================================================
Cluster 10: Largest mean parameters in cluster 

Word        Mean        Variance    
ux          1.18e-01    3.41e-02
design      1.04e-01    5.87e-03
project     9.64e-02    1.74e-02
user        6.72e-02    8.70e-03
designers   5.59e-02    9.39e-03
users       5.17e-02    6.04e-03
designer    4.94e-02    3.67e-03
client      4.93e-02    9.98e-03
research    4.91e-02    8.98e-03
product     4.45e-02    3.46e-03

==========================================================
Cluster 11: Largest mean parameters in cluster 

Word        Mean        Variance    
data        2.05e-01    1.91e-02
database    5.26e-02    1.34e-02
kafka       4.14e-02    1.06e-02
use         2.97e-02    6.30e-04
user        2.88e-02    1.66e-03
metrics     2.71e-02    2.52e-03
analytics   2.70e-02    6.32e-03
schema      2.65e-02    3.64e-03
conversion  2.42e-02    7.70e-03
using       2.41e-02    5.14e-04

==========================================================
Cluster 12: Largest mean parameters in cluster 

Word        Mean        Variance    
trump       3.07e-01    3.17e-02
vcs         1.12e-01    5.56e-02
combinator  8.80e-02    6.97e-02
startup     7.11e-02    4.47e-02
hillary     6.86e-02    1.65e-02
lps         5.63e-02    3.80e-02
start       5.40e-02    2.50e-02
jonah       4.97e-02    2.97e-02
country     4.75e-02    5.00e-03
election    4.41e-02    1.43e-03

==========================================================
Cluster 13: Largest mean parameters in cluster 

Word        Mean        Variance    
investors   8.78e-02    9.17e-03
capital     6.44e-02    6.17e-03
company     6.04e-02    3.48e-03
companies   5.58e-02    3.84e-03
founders    5.50e-02    5.33e-03
market      5.19e-02    4.69e-03
business    4.94e-02    4.91e-03
startup     4.69e-02    2.87e-03
money       4.56e-02    3.50e-03
equity      4.48e-02    1.30e-02

==========================================================
Cluster 14: Largest mean parameters in cluster 

Word        Mean        Variance    
learning    1.30e-01    1.90e-02
machine     1.16e-01    1.53e-02
neural      5.77e-02    1.43e-02
data        5.42e-02    6.61e-03
deep        5.37e-02    5.25e-03
ai          4.53e-02    1.43e-02
intelligence 4.53e-02    1.26e-02
computers   3.40e-02    6.70e-03
quantum     3.31e-02    1.99e-02
et          3.18e-02    1.40e-02

==========================================================
Cluster 15: Largest mean parameters in cluster 

Word        Mean        Variance    
microservices 4.84e-02   1.92e-02
code        4.22e-02    6.33e-03
service     3.37e-02    4.40e-03
services    3.25e-02    3.91e-03
uber        2.99e-02    1.05e-02
data        2.40e-02    1.23e-03
new         2.06e-02    6.57e-04
architecture 1.86e-02    1.52e-03
infrastructure 1.85e-02  2.37e-03
time        1.82e-02    4.06e-04

==========================================================
Cluster 16: Largest mean parameters in cluster 

Word        Mean        Variance    
product     2.98e-01    1.15e-02
manager     8.90e-02    7.62e-03
management  7.96e-02    6.60e-03
managers    6.72e-02    5.66e-03
pm          5.06e-02    1.78e-02
team        4.99e-02    2.03e-03
development 4.44e-02    5.82e-03
products    3.66e-02    1.68e-03
role        3.36e-02    3.06e-03
people      2.84e-02    9.51e-04

==========================================================
Cluster 17: Largest mean parameters in cluster 

Word        Mean        Variance    
culture     1.66e-01    4.50e-02
teams       1.42e-01    2.06e-02
team        1.26e-01    1.12e-02
innovation  8.65e-02    2.56e-02
employees   8.13e-02    1.25e-02
company     6.95e-02    5.01e-03
people      5.61e-02    1.60e-03
work        4.80e-02    1.05e-03
ideas       4.79e-02    4.39e-03
innovative  4.39e-02    9.51e-03

==========================================================
Cluster 18: Largest mean parameters in cluster 

Word        Mean        Variance    
content     2.23e-01    2.19e-02
seo         6.51e-02    2.64e-02
blog        4.60e-02    8.35e-03
page        4.60e-02    4.04e-03
search      4.01e-02    1.16e-02
product     3.84e-02    3.18e-03
readers     3.56e-02    5.51e-03
notifications 3.44e-02   2.23e-02
reader      3.06e-02    4.38e-03
gestures    3.00e-02    1.37e-02

==========================================================
Cluster 19: Largest mean parameters in cluster 

Word        Mean        Variance    
team        1.28e-01    8.92e-03
product     8.06e-02    5.43e-03
people      5.19e-02    2.33e-03
company     4.93e-02    2.57e-03
teams       4.72e-02    3.51e-03
time        4.32e-02    1.15e-03
work        3.97e-02    1.15e-03
says        3.90e-02    4.63e-03
new         3.03e-02    9.49e-04
goals       2.72e-02    6.12e-03

==========================================================

In [13]:
# Number of articles in each cluster for the first model (10 clusters)
resps_10k = sf.SFrame(model_em_10k['resp'])
resps_10k = resps_10k.unpack('X1', '')
cluster_id = 0
cluster_hash_10model = {}
for col in resps_10k.column_names():
    cluster_10k = np.array(resps_10k[col])
    print "cluster ", cluster_id, "assignments: ", cluster_10k.sum()
    cluster_hash_10model[cluster_id] = cluster_10k.nonzero()   # member article indices
    cluster_id += 1


cluster  0 assignments:  359.0
cluster  1 assignments:  135.0
cluster  2 assignments:  139.0
cluster  3 assignments:  11.0
cluster  4 assignments:  26.0
cluster  5 assignments:  29.0
cluster  6 assignments:  365.0
cluster  7 assignments:  92.0
cluster  8 assignments:  230.0
cluster  9 assignments:  37.0

In [14]:
# Number of articles in each cluster for the second model (20 clusters)
resps_20k = sf.SFrame(model_em_20k['resp'])
resps_20k = resps_20k.unpack('X1', '')
cluster_id = 0
cluster_hash_20model = {}
for col in resps_20k.column_names():
    cluster_20k = np.array(resps_20k[col])
    print "cluster ", cluster_id, "assignments: ", cluster_20k.sum()
    cluster_hash_20model[cluster_id] = cluster_20k.nonzero()   # member article indices
    cluster_id += 1


cluster  0 assignments:  73.0
cluster  1 assignments:  43.0
cluster  2 assignments:  141.0
cluster  3 assignments:  257.0
cluster  4 assignments:  26.0
cluster  5 assignments:  86.0
cluster  6 assignments:  11.0
cluster  7 assignments:  13.0
cluster  8 assignments:  39.0
cluster  9 assignments:  17.0
cluster  10 assignments:  62.0
cluster  11 assignments:  48.0
cluster  12 assignments:  13.0
cluster  13 assignments:  95.0
cluster  14 assignments:  37.0
cluster  15 assignments:  189.0
cluster  16 assignments:  99.0
cluster  17 assignments:  23.0
cluster  18 assignments:  21.0
cluster  19 assignments:  130.0

In [15]:
# Articles' categories in model 1 (10 clusters)
clusters_10k_idx = []
for col in resps_10k.column_names():
    cluster_10k = np.array(resps_10k[col])
    cluster_10k = cluster_10k.nonzero()[0]
    clusters_10k_idx.append(cluster_10k)
clusters_report(clusters_10k_idx)


Cluster  0 
==========================

product count :  53 
engineering count :  7 
business count :  299 

Cluster  1 
==========================

product count :  105 
engineering count :  17 
business count :  13 

Cluster  2 
==========================

product count :  3 
engineering count :  131 
business count :  5 

Cluster  3 
==========================

product count :  0 
engineering count :  0 
business count :  11 

Cluster  4 
==========================

product count :  1 
engineering count :  24 
business count :  1 

Cluster  5 
==========================

product count :  0 
engineering count :  27 
business count :  3 

Cluster  6 
==========================

product count :  94 
engineering count :  85 
business count :  186 

Cluster  7 
==========================

product count :  82 
engineering count :  0 
business count :  10 

Cluster  8 
==========================

product count :  182 
engineering count :  1 
business count :  47 

Cluster  9 
==========================

product count :  32 
engineering count :  2 
business count :  3 


In [16]:
# Articles' categories in model 2 (20 clusters)
clusters_20k_idx = []
for col in resps_20k.column_names():
    cluster_20k = np.array(resps_20k[col])
    cluster_20k = cluster_20k.nonzero()[0]
    clusters_20k_idx.append(cluster_20k)
clusters_report(clusters_20k_idx)


Cluster  0 
==========================

product count :  66 
engineering count :  2 
business count :  5 

Cluster  1 
==========================

product count :  18 
engineering count :  24 
business count :  1 

Cluster  2 
==========================

product count :  93 
engineering count :  0 
business count :  48 

Cluster  3 
==========================

product count :  85 
engineering count :  5 
business count :  167 

Cluster  4 
==========================

product count :  1 
engineering count :  24 
business count :  1 

Cluster  5 
==========================

product count :  32 
engineering count :  9 
business count :  45 

Cluster  6 
==========================

product count :  0 
engineering count :  0 
business count :  11 

Cluster  7 
==========================

product count :  1 
engineering count :  12 
business count :  0 

Cluster  8 
==========================

product count :  0 
engineering count :  0 
business count :  39 

Cluster  9 
==========================

product count :  1 
engineering count :  16 
business count :  0 

Cluster  10 
==========================

product count :  53 
engineering count :  1 
business count :  8 

Cluster  11 
==========================

product count :  9 
engineering count :  34 
business count :  5 

Cluster  12 
==========================

product count :  4 
engineering count :  0 
business count :  9 

Cluster  13 
==========================

product count :  1 
engineering count :  0 
business count :  94 

Cluster  14 
==========================

product count :  6 
engineering count :  22 
business count :  9 

Cluster  15 
==========================

product count :  17 
engineering count :  137 
business count :  35 

Cluster  16 
==========================

product count :  98 
engineering count :  0 
business count :  1 

Cluster  17 
==========================

product count :  6 
engineering count :  2 
business count :  15 

Cluster  18 
==========================

product count :  12 
engineering count :  1 
business count :  8 

Cluster  19 
==========================

product count :  49 
engineering count :  5 
business count :  76 

Section 4:

Recommendations and predictions for articles

Recommendation method:

To recommend articles, retrieve the cluster that the query article belongs to, fetch all the articles in that cluster, and pass them to a nearest-neighbours model to find the 10 best recommendations for the query article.

Prediction method:

Given a set of articles, predict the cluster each one belongs to based on the trained model.

  • The test dataset is used to predict a cluster for each article with both trained models.

In [17]:
# Find the cluster that contains a given article; return its id and member indices
def articles_inds(article_id, cluster_hash_model):
    for cluster_id in cluster_hash_model:
        np_array = np.array(cluster_hash_model[cluster_id])
        if article_id in np_array:
            return cluster_id, np_array

In [18]:
def recommender(article_id, cluster_hash_model, no_articles, data_articles):
    start_time = time.time()
    cid, inds = articles_inds(article_id, cluster_hash_model)
    cluster_articles = data_articles.filter_by(inds[0], 'X1')
    cluster_articles = cluster_articles.add_row_number()

    # Re-fit tf-idf on the cluster's articles only
    recom_vec = TfidfVectorizer(stop_words='english')
    tfidf_recommend = recom_vec.fit_transform(cluster_articles['text'])
    tfidf_recommend = normalize(tfidf_recommend)

    # Row of the query article within the cluster, then its nearest neighbours
    row_id = cluster_articles[cluster_articles['X1'] == article_id]['id'][0]
    NN_model = NearestNeighbors(n_neighbors=no_articles).fit(tfidf_recommend)
    distances, indices = NN_model.kneighbors(tfidf_recommend[row_id])

    # Map cluster-local row ids back to dataset-wide 'X1' ids
    recommended_ids = []
    for i in indices[0]:
        recommended_ids.append(cluster_articles[cluster_articles['id'] == i]['X1'][0])

    del cluster_articles
    del tfidf_recommend
    del recom_vec
    #print("--- %s seconds ---" % (time.time() - start_time))
    #print len(inds[0])
    return recommended_ids
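
A minimal usage sketch (the full loop over the whole dataset appears in In [21] below). no_articles is set to 11 because the nearest neighbour of an article is always the article itself, so the first returned id is the query article:

recs = recommender(0, cluster_hash_20model, 11, dataset)
print recs[1:]   # drop the query article, keep the 10 recommendations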

In [19]:
def predict_cluster(articles, em_model):
    article_tfidf = tfidfvec.transform(articles['text'])
    mu = deepcopy(em_model['means'])
    sigma = deepcopy(em_model['covs'])
    assignments = []
    for j in range(article_tfidf.shape[0]):
        resps = []
        for i in range(len(em_model['weights'])):
            # Log-posterior (up to a constant): log weight + diagonal-Gaussian log-density
            predict = np.log(em_model['weights'][i]) + logpdf_diagonal_gaussian(article_tfidf[j], mu[i], sigma[i])
            resps.append(predict)
        assignments.append(int(np.argmax(resps)))   # MAP cluster assignment
    return assignments
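
predict_cluster implements the MAP rule under the trained mixture: each article is assigned to the component with the highest log-posterior (up to an additive constant shared by all components),

$$\hat{z} = \arg\max_{k}\Bigl[\log w_k + \log \mathcal{N}\!\bigl(x \mid \mu_k,\ \mathrm{diag}(\sigma_k^2)\bigr)\Bigr]$$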

In [21]:
# Generate recommendations for every article; they are appended to the SFrame and exported in the next cells.
recommended_inds = []
start_time = time.time()
for i in range(len(dataset)):
    recommended_inds.append(recommender(i, cluster_hash_20model, 11, dataset))

print("--- %s seconds (total recommendation runtime) ---" % (time.time() - start_time))


--- 517.98885417 seconds (total recommendation runtime) ---

In [22]:
rec_inds = sf.SArray(recommended_inds)
dataset.add_column(rec_inds, name='recommendations')


Out[22]:
X1  file_name                                  category     text                                                 recommendations
0   training-dataset/engineering/912.txt ...   engineering  Uber s case for incremental processin ...            [0.0, 334.0, 1289.0, 638.0, 1414.0, 413.0, ...
1   training-dataset/business/747.txt ...      business     On the Road to Recap Why the Unicorn Financing ...   [1.0, 1378.0, 545.0, 398.0, 1238.0, 752.0, ...
2   training-dataset/product/919.txt ...       product      How designers can use data to create amazing ...     [2.0, 740.0, 254.0, 397.0, 1331.0, 1139.0, ...
3   training-dataset/business/222.txt ...      business     The Arc of Company Life and How to Prolong ItOn ...  [3.0, 432.0, 111.0, 1.0, 752.0, 621.0, 1317.0, ...
4   training-dataset/business/238.txt ...      business     Advice to Grads Join A Winning Startup v 2016 ...    [4.0, 890.0, 1086.0, 572.0, 281.0, 707.0, ...
5   training-dataset/product/297.txt ...       product      GV Guide to Design Critique GV LibraryGV ...         [5.0, 1236.0, 207.0, 235.0, 1251.0, 523.0, ...
6   training-dataset/product/1281.txt ...      product      Beating designer s blockThose hours or days ...      [6.0, 1316.0, 25.0, 1280.0, 609.0, 1145.0, ...
7   training-dataset/product/310.txt ...       product      How to create effective push notificationsOver ...   [7.0, 209.0, 113.0, 924.0, 1161.0, 853.0, ...
8   training-dataset/product/160.txt ...       product      Thoughtbot s Kyle Fiedler Know yourself and trust .. [8.0, 862.0, 523.0, 879.0, 1024.0, 1251.0, ...
9   training-dataset/product/92.txt ...        product      A product team s friend or foe Feature Req ...       [9.0, 1116.0, 507.0, 121.0, 692.0, 605.0, ...
[1423 rows x 5 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [23]:
dataset.save('Articles_with_recommendations.csv',format='csv')

In [24]:
# Save each cluster's data in a separate CSV file
for cluster_id in cluster_hash_20model:
    ind = np.array(cluster_hash_20model[cluster_id])
    #print ind
    cluster_articles = dataset.filter_by(ind[0], 'X1')
    cluster_articles.save('Clusters_model20/cluster_' + str(cluster_id) + '.csv', format='csv')
    del cluster_articles

Test data for cluster assignment.


In [25]:
testset = sf.SFrame('Dataset/KO_articles_test.csv')


Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_articles_test.csv
Parsing completed. Parsed 97 lines in 0.099565 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Finished parsing file /home/abdl-rahman/Desktop/Recommendation systems/EM for clustering/Dataset/KO_articles_test.csv
Parsing completed. Parsed 97 lines in 0.062474 secs.

In [26]:
test_tfidf = tfidfvec.transform(testset['text'])   # note: predict_cluster re-transforms the text itself
# Predict using the model with 10 clusters.
test_predictions = predict_cluster(testset, model_em_10k)
test_predictions = np.array(test_predictions)
test_predictions


Out[26]:
array([0, 6, 0, 0, 6, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 0, 0, 0, 2, 8, 1, 2, 2,
       2, 2, 0, 0, 6, 1, 0, 2, 0, 1, 7, 2, 2, 2, 2, 2, 2, 6, 5, 0, 2, 2, 6,
       2, 1, 2, 2, 2, 2, 0, 0, 6, 1, 0, 2, 0, 1, 7, 2, 2, 2, 2, 2, 2, 6, 5,
       0, 2, 2, 6, 2, 6, 1, 0, 6, 0, 0, 0, 6, 6, 0, 0, 6, 0, 0, 8, 6, 0, 6,
       1, 8, 0, 0, 0])

In [27]:
# Predict using the model with 20 clusters.
test_predictions = predict_cluster(testset, model_em_20k)
test_predictions = np.array(test_predictions)
test_predictions


Out[27]:
array([13, 15,  5, 15, 13,  3, 19,  3,  3,  2,  3,  3, 15,  3, 19,  3, 19,
        3, 15,  2, 15, 11, 11, 15, 15, 13,  3, 14,  1, 15, 15, 15, 15,  2,
       15, 15, 15, 15, 15,  0, 15, 10, 15, 15, 15, 15, 15, 15, 11, 11, 15,
       15, 13,  3, 14,  1, 15, 15, 15, 15,  2, 15, 15, 15, 15, 15,  0, 15,
       10, 15, 15, 15, 15, 15,  3,  1,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        5, 19, 19,  2,  3, 19,  5, 15, 16,  3,  2,  3])

In [ ]: